# Rendering setup: showtext lets ggplot2/base graphics draw CJK glyphs.
library(showtext)
showtext_auto(enable = TRUE)
# Register Taipei Sans TC under the alias "自訂字體" ("custom font").
# NOTE(review): machine-specific absolute path — confirm on other hosts.
font_add("自訂字體", "/Users/mac/Documents/in_sync_mac/in_sync_documents/fonts/TaipeiSansTCBeta-Regular.ttf")
library(tidyverse)
library(mgcv)   # GAM fitting/inspection (predict, plot.gam)
library(psych)  # lowerCor() correlation table

# Fix RNG state so sampling/clustering below is reproducible.
set.seed(1)
# Base directory of this run's chunked-frequency dispersion data.
label <- "/Users/mac/Documents/GitHub/GIL/CCKF/dietcoke/data/chunked_freq/20211115-104047-dispersion/"

Data preparation

library(arrow)

# Load the per-slice character frequency table produced upstream.
# Fail loudly if the parquet is missing instead of silently skipping the
# read and letting every later `df$...` access error opaquely.
fp <- paste0(label, "gam_df.parquet")
if (!file.exists(fp)) {
  stop("Input parquet not found: ", fp, call. = FALSE)
}
df <- read_parquet(fp)
df$text_slice <- as.numeric(df$text_slice)

# Sanity checks on the loaded data (direct logical expressions instead
# of the `== FALSE` / `== TRUE` comparison anti-pattern).
!any(is.na(df$mid_year))    # every slice has a mid_year
## [1] TRUE
all(!is.na(df$mid_year))    # equivalent check, kept for the record
## [1] TRUE
!("wb114254" %in% df$urn)   # the excluded text is really absent
## [1] TRUE
length(unique(df$urn))      # number of distinct texts in the sample
## [1] 63
# Over-dispersed characters from the results of the vocabulary growth
# curve (reference note, not used programmatically):
#   子 曰 者 於 為 有 其 人 一 而 以 也 不 之

# Subsets of characters from 蔣 (2005):
#   布;燈 | 鐘;錶;磐;篪 | 槍;刀;劍;戟;炮 | 鯨 | 心 | 城;池 | 快;慢 | 籽 |
#   日;山;涉;踄 | 大;小;高 | 貫;毌;擐;關 | 矢;誓 | 歌;唱;和

# Over-dispersed mid-frequency characters:
#   避 聚 助 縱 殊 假 戒 竟 辨 委 屈 負 違 仰 庫 畏 屢 惜 逐 符 側 覺 貫 稽 稍
# Under-dispersed mid-frequency characters:
#   瑭 尿 酳 皝 嚐 緞 痘 礼 楽 与 鄩 爹 獘 𣗳 𧰼 𫠦 淂 縀 媽 𡻕 啊 捌 啥 㭍 叁
# (The lists above were Python-style bracket literals — invalid R syntax
# that errored when sourced — so they are kept as comments.)

# Assign a stable integer id to every character in the sample.
chars <- unique(df$char)
char_df <- data.frame(char = chars, char_id = seq_along(chars))
# Explicit join key instead of relying on dplyr's column-name guess.
df <- left_join(df, char_df, by = "char")

Cleaning

# Assign a global slice index k: slices ordered by time, then text, then
# position within the text. k is the time axis for the GAM smooths below.
kdf <- df %>%
  select(mid_year, urn, text_slice) %>%
  distinct() %>%
  arrange(mid_year, urn, text_slice) %>%
  mutate(k = row_number())

# Dynastic spans and the closing year of each span.
dynaspan_levels <- c("先秦", "漢", "魏晉南北", "唐五代十國", "宋元", "明", "清", "民國")
dynaspan_yearto_levels <- c(-221, 220, 589, 1125, 1368, 1644, 1911, 1949)
dynaspan_df <- data.frame(
  dynaspan=dynaspan_levels,
  dynaspan_yearto=dynaspan_yearto_levels
)

# Explicit join keys: silences dplyr's "Joining, by = ..." messages and
# guards against accidental key drift if columns are renamed upstream.
df <- left_join(df, kdf, by = c("urn", "text_slice", "mid_year")) %>%
  left_join(dynaspan_df, by = "dynaspan")
# Order the dynasty factor chronologically for plots/tables.
df$dynaspan <- factor(df$dynaspan, levels=dynaspan_levels)
# Overdispersion check: variance of the raw counts far exceeds the mean,
# so an equidispersed Poisson model would fit poorly.
var(df$raw_freq)
## [1] 5.150873
mean(df$raw_freq)
## [1] 0.9699206
# Zero inflation check: the observed share of zeros exceeds what a
# Poisson with the same mean would predict.
ppois(0, mean(df$raw_freq)) < mean(df$raw_freq == 0)
## [1] TRUE
# Share of zero-count (char, slice) cells. `%>%` binds tighter than `/`,
# so this evaluates as (number of zero rows) / nrow(df).
df %>% filter(raw_freq == 0) %>% nrow() / nrow(df)
## [1] 0.6747718
# log1p-transformed frequency used as the response in the plots below.
df$logfreq <- log(df$raw_freq+1)

ggplot(df, aes(raw_freq)) + geom_histogram(binwidth=1)

ggplot(df, aes(logfreq)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Base-graphics densities of raw vs. log frequency, stacked vertically.
par(mfrow=c(2,1))
plot(density(df$raw_freq))
plot(density(df$logfreq))

# Two stacked base-graphics panels.
par(mfrow = c(2, 1))

# How many text slices sit at each mid-point year, best-covered first.
mid_year_distribution <- df %>%
  select(mid_year, urn, text_slice) %>%
  distinct() %>%
  group_by(mid_year) %>%
  count() %>%
  arrange(desc(n))

# Density of slice mid-years, with dynasty boundaries marked in blue
# and the raw years shown as a rug along the axis.
plot(density(mid_year_distribution$mid_year))
abline(v = dynaspan_yearto_levels, col = "lightblue")
text(
  dynaspan_yearto_levels, 0.000025, dynaspan_levels,
  cex = 0.65, pos = 2, col = "lightblue"
)
rug(mid_year_distribution$mid_year, ticksize = -0.1, col = "blue") # side=3

# Standardized mid-years, for a quick look at their spread.
mid_year_distribution$mid_year_scaled <- scale(mid_year_distribution$mid_year)
hist(mid_year_distribution$mid_year_scaled)

# Texts and text slices per dynastic span.
df %>%
  distinct(dynaspan, k, urn) %>%
  group_by(dynaspan) %>%
  summarize(
    `Number of texts` = n_distinct(urn),
    `Number of text slices` = n()
  )
## # A tibble: 8 × 3
##   dynaspan   `Number of texts` `Number of text slices`
##   <fct>                  <int>                   <int>
## 1 先秦                       1                      12
## 2 漢                        10                     499
## 3 魏晉南北                  10                     218
## 4 唐五代十國                10                     248
## 5 宋元                      10                     312
## 6 明                        10                     293
## 7 清                        10                     178
## 8 民國                       2                     256
library(readr)
# Persist the enriched table for downstream use, then preview it.
write_csv(df, "gam_df2.csv")
head(df)
## # A tibble: 6 × 14
##   dynaspan char  raw_freq urn      text_slice dzg      title mid_year author_norm
##   <fct>    <chr>    <int> <chr>         <dbl> <chr>    <chr>    <dbl> <chr>      
## 1 先秦     避           1 wb422571          0 道藏/藏… 列子      -412 列御寇     
## 2 先秦     啥           0 wb422571          7 道藏/藏… 列子      -412 列御寇     
## 3 先秦     㭍           0 wb422571          7 道藏/藏… 列子      -412 列御寇     
## 4 先秦     叁           0 wb422571          7 道藏/藏… 列子      -412 列御寇     
## 5 先秦     避           0 wb422571          8 道藏/藏… 列子      -412 列御寇     
## 6 先秦     聚           4 wb422571          8 道藏/藏… 列子      -412 列御寇     
## # … with 5 more variables: __index_level_0__ <int>, char_id <int>, k <int>,
## #   dynaspan_yearto <dbl>, logfreq <dbl>
# Pairwise correlations (psych::lowerCor) over all numeric columns.
# k, mid_year and dynaspan_yearto are near-collinear (r >= 0.96): all
# three encode time, so at most one should enter a model.
df %>%
  keep(is.numeric) %>%
  lowerCor()
##                   rw_fr txt_s md_yr ____0 chr_d k     dyns_ lgfrq
## raw_freq           1.00                                          
## text_slice         0.01  1.00                                    
## mid_year           0.03  0.16  1.00                              
## __index_level_0__  0.00  0.22 -0.21  1.00                        
## char_id            0.01  0.00  0.00  0.00  1.00                  
## k                  0.02  0.20  0.99 -0.18  0.00  1.00            
## dynaspan_yearto    0.02  0.09  0.96 -0.34  0.00  0.96  1.00      
## logfreq            0.86  0.01  0.05 -0.02  0.03  0.04  0.03  1.00
# Plain base-R alternative, kept for reference:
#df %>%
#  keep(is.numeric) %>%
#  cor()

Dispersion of characters ordered by frequency rank and text slice of 100K characters

# Cumulative number of text slices at each dynasty boundary (reference
# values; the original Python-style bracket literal was invalid R):
#   12, 687, 1868, 4094, 12537, 20026, 45804, 46060
# Occurrence map: one dot per (slice, character) cell with a nonzero
# count; horizontal gaps reveal each character's dispersion over time.
df %>%
  filter(raw_freq > 0) %>%
  ggplot(aes(k, char_id)) +
  geom_point(size = 0.5) +
  theme_minimal() +
  scale_y_continuous(
    breaks = seq_along(chars),
    labels = paste0(seq_along(chars), "_", chars)
  ) +
  labs(x = "Text slice", y = "Character")

Clustering

# Load every fitted GAM saved by the model-fitting step and compare
# them by information criteria. `full.names = TRUE` is spelled out:
# the original `full = TRUE` relied on partial argument matching.
folder <- paste0(label, "gam_models")
models <- lapply(list.files(folder, full.names = TRUE), readRDS)

# One row per model: its formula rendered as "lhs ~ rhs", plus AIC/BIC.
# vapply() is used instead of lapply() %>% unlist() so a malformed model
# object fails with a type error rather than silently flattening.
gam_df <- data.frame(
  formula = vapply(models, function(x) {
    # as.character(formula) yields c("~", lhs, rhs); reassemble readably.
    fml <- as.character(x$formula)
    paste(fml[[2]], fml[[1]], fml[[3]])
  }, character(1)),
  AIC = vapply(models, AIC, numeric(1)),
  BIC = vapply(models, BIC, numeric(1))
) %>%
  mutate(model_id = row_number()) %>%
  arrange(AIC, formula) %>%
  select(model_id, AIC, BIC, formula); gam_df
##   model_id      AIC      BIC
## 1        1 204915.3 208330.1
## 2        3 209323.2 215949.7
## 3        2 213227.2 219325.5
##                                                                       formula
## 1              raw_freq ~ s(k, by = char, id = 1, m = 1) + s(char, bs = "re")
## 2 raw_freq ~ s(k, char, bs = "fs", m = 1, k = 20) + s(author_norm, bs = "re")
## 3                             raw_freq ~ s(k, char, bs = "fs", m = 1, k = 20)
# Reshape each model's coefficient vector (intercept dropped) into a
# one-row-per-character matrix used as the clustering feature space.
coef_mats <- lapply(models, function(x){
  coefs <- coef(x)
  # Drop the intercept; the remainder is assumed to be per-character
  # smooth coefficients — TODO confirm against the model's term layout.
  coefs <- coefs[2:length(coefs)]
  # NOTE(review): the warning below shows that for at least one model
  # 1056 coefficients do not tile evenly into 50 rows (extra terms such
  # as random effects add coefficients), so matrix() recycles/wraps —
  # verify before interpreting that model's columns.
  coef_mat <- matrix(coefs, nrow=length(chars))
  return(coef_mat)
})
## Warning in matrix(coefs, nrow = length(chars)): data length [1056] is not a sub-
## multiple or multiple of the number of rows [50]
# Same matrices as data frames, rows labelled "<id>_<char>" so cluster
# plots/dendrograms show readable character labels.
coef_dfs <- lapply(coef_mats, function(x){
  coef_df <- data.frame(x)
  rownames(coef_df) <- paste0(1:length(chars), "_", chars)
  return(coef_df)
})

# Work with the best model (lowest AIC, models[[1]] after sorting).
model <- models[[1]]

model$formula
## raw_freq ~ s(k, by = char, id = 1, m = 1) + s(char, bs = "re")
# The object is a `gam` fit, not a `gamm` list, so summarize it directly.
# The original `summary(model$gam)` accessed a nonexistent component and
# printed NULL (as its recorded output showed).
print(summary(model))
coef_mat <- coef_mats[[1]]
coef_df <- coef_dfs[[1]]

Distance (Dissimilarity)

library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Euclidean distances between characters in coefficient space.
# stand = FALSE: coefficients already share a scale, so no z-scoring.
res.dist <- get_dist(coef_df, stand=FALSE, method = "euclidean")

# Heatmap of the (reordered) distance matrix: blue = similar, red = far.
fviz_dist(res.dist, 
   gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))

K-means

# Diagnostics for choosing k: silhouette, elbow (wss) and gap statistic.
c("silhouette", "wss", "gap_stat") %>%
  walk(~ print(fviz_nbclust(coef_mat, kmeans, method = .x)))

# K-means with k = 2; 25 random starts guard against poor local optima.
km.res <- kmeans(coef_df, 2, nstart = 25)
fviz_cluster(
  km.res, data = coef_df,
  palette = "jco", ggtheme = theme_minimal(), geom = "text"
)

PAM

library(cluster)

# Partitioning Around Medoids with k = 2: like k-means but centers are
# actual observations, making it more robust to outliers.
pam.res <- pam(coef_df, 2)
fviz_cluster(pam.res,
             palette = "jco", ggtheme = theme_minimal(), geom="text")

Hierarchical clustering

# Agglomerative clustering with Ward's criterion on Euclidean distances.
res.hc <- hclust(dist(coef_df), method="ward.D2")

# Dendrogram cut into k = 2 clusters; the 4-color palette is longer
# than k, hence the "first k elements are used" warning below.
fviz_dend(res.hc, k = 2,
          cex = 0.5,
          k_colors = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07"),
          color_labels_by_k = TRUE,
          rect = TRUE
          )
## Warning in get_col(col, k): Length of color vector was longer than the number of
## clusters - first k elements are used
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

# Finer-grained cut: hierarchical clustering with k = 20 via eclust,
# which also computes silhouette information for the cut.
res.hc <- coef_df %>%
  eclust("hclust", k = 20, graph = FALSE)

fviz_dend(res.hc, 
          cex = 0.5, palette = "jco", rect = TRUE)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

# Per-cluster silhouette widths; many singleton clusters (width 0)
# suggest k = 20 over-partitions this data.
fviz_silhouette(res.hc)
##    cluster size ave.sil.width
## 1        1    2          0.32
## 2        2    2          0.44
## 3        3    1          0.00
## 4        4   10          0.56
## 5        5    4          0.36
## 6        6    7          0.64
## 7        7    3          0.61
## 8        8    1          0.00
## 9        9    3          0.43
## 10      10    1          0.00
## 11      11    1          0.00
## 12      12    1          0.00
## 13      13    3          0.10
## 14      14    2          0.71
## 15      15    1          0.00
## 16      16    1          0.00
## 17      17    2          0.38
## 18      18    3          0.36
## 19      19    1          0.00
## 20      20    1          0.00

# Silhouette width of observations (cluster, neighbor, sil_width cols).
sil <- res.hc$silinfo$widths[, 1:3]

# Objects with negative silhouette: likely assigned to the wrong cluster.
neg_sil_index <- which(sil[, 'sil_width'] < 0)
sil[neg_sil_index, , drop = FALSE]
##       cluster neighbor   sil_width
## 44_側      13        4 -0.09563395

Fuzzy clustering

# Fuzzy clustering (FANNY) with k = 2: each observation gets membership
# degrees rather than a hard assignment. The warnings below show it did
# not converge and memberships are nearly uniform — treat with caution.
res.fanny <- fanny(coef_df, k=2)
## Warning in fanny(coef_df, k = 2): FANNY algorithm has not converged in 'maxit' =
## 500 iterations
## Warning in fanny(coef_df, k = 2): the memberships are all very close to 1/k.
## Maybe decrease 'memb.exp' ?
fviz_cluster(res.fanny,
             palette = "jco", ggtheme = theme_minimal())

fviz_silhouette(res.fanny,
                palette = "jco", ggtheme = theme_minimal())
##   cluster size ave.sil.width
## 1       1   25          0.44
## 2       2   25          0.09

Density-based clustering

library(fpc)

# DBSCAN: neighborhood radius eps = 5, at least 2 points per core point.
db <- fpc::dbscan(coef_df, eps = 5, MinPts = 2)
fviz_cluster(db, data = coef_df, stand = FALSE,
             geom = "point", palette = "jco", ggtheme = theme_classic())

# Same clustering, labelled with character names instead of points.
fviz_cluster(db, data = coef_df, stand = FALSE,
             geom = "text", palette = "jco", ggtheme = theme_classic())

# kNN distance plot to sanity-check the eps choice (look for the elbow).
# Uses the `dbscan` package (distinct from fpc::dbscan above).
dbscan::kNNdistplot(coef_df, k =  5)

Ordered dissimilarity image (ODI)
gradient.color <- list(low = "steelblue",  high = "white")

# Clustering tendency: Hopkins statistic plus an ordered dissimilarity
# image. Values near 1 (here 0.84) indicate clusterable structure.
coef_mat %>%
  get_clust_tendency(n = 2, gradient = gradient.color)
## $hopkins_stat
## [1] 0.8442724
## 
## $plot

Source: https://www.datanovia.com/en/blog/types-of-clustering-methods-overview-and-quick-start-r-code/

library(lattice)

# Split the dzg path into two levels. Some entries contain no "/", so
# the second level is NA; fill = "right" makes that intent explicit and
# silences the 22,500-row "Missing pieces" warning the default emitted.
df <- df %>%
  separate(dzg, into = c("dzg_lev1", "dzg_lev2"), sep = "/",
           remove = FALSE, fill = "right") %>%
  # Keep a readable "<id>_<char>" label, then replace `char` by its
  # numeric id so the factor below orders panels numerically.
  mutate(char_label = paste0(char_id, "_", char), char = char_id)
df$char <- factor(df$char)
df$author_norm <- factor(df$author_norm)
df$dzg <- factor(df$dzg)
df$dzg_lev1 <- factor(df$dzg_lev1)

# Fitted values from the selected GAM. Use the generic predict() with a
# named newdata argument rather than calling the predict.gam method
# directly (method dispatch is the idiomatic route).
df$fit <- predict(model, newdata = df)

# Fano factor (variance / mean of logfreq) per character: a dispersion
# index — high values mean bursty usage, low values even spread.
# .groups = "drop_last" reproduces summarise()'s default (result grouped
# by `char`) while silencing the grouping message.
new_df <- df %>%
  group_by(char, char_id) %>%
  summarize(fano = var(logfreq, na.rm = TRUE) / mean(logfreq, na.rm = TRUE),
            .groups = "drop_last")
new_df %>% arrange(desc(fano))
## # A tibble: 50 × 3
## # Groups:   char [50]
##    char  char_id  fano
##    <fct>   <int> <dbl>
##  1 7           7  3.89
##  2 23         23  3.33
##  3 25         25  2.33
##  4 30         30  2.31
##  5 24         24  2.13
##  6 38         38  1.97
##  7 22         22  1.92
##  8 50         50  1.75
##  9 31         31  1.48
## 10 17         17  1.39
## # … with 40 more rows
# Characters in the bottom / top 10% of the Fano distribution.
fano_quantiles <- quantile(new_df$fano, probs = c(0.1, 0.9), na.rm = TRUE)
indices_low <- new_df %>% filter(fano < fano_quantiles[1]) %>% pull(char_id)
indices_high <- new_df %>% filter(fano > fano_quantiles[2]) %>% pull(char_id)

# Partial-effect smooths for the least dispersed characters.
par(mfrow = c(2, 3))
for (i in indices_low) {
  plot(model, select = i, main = chars[i])
}

# ...and for the most dispersed characters.
par(mfrow = c(2, 3))
for (i in indices_high) {
  plot(model, select = i, main = chars[i])
}

# Larger default plot size for the lattice panel grid.
options(plot.repr.height = 12, plot.repr.width = 12)
# One logfreq-over-time panel per character.
xyplot(logfreq ~ k | char_label, data = df, type = "l")

# Burstiness per (character, slice): the character's observed share of
# its total logfreq minus the uniform share 1/n; positive values mark
# bursty periods. The median over slices summarizes each character.
# Explicit .groups arguments reproduce summarise()'s defaults ("keep"
# for the multi-row first step, "drop_last" for the second) while
# silencing the grouping messages.
bursty_df <- df %>%
  group_by(char, char_id) %>%
  summarize(burstiness = (logfreq / sum(logfreq, na.rm = TRUE)) - (1 / n()),
            .groups = "keep") %>%
  mutate(is_bursty_period = burstiness > 0) %>%
  arrange(desc(burstiness)) %>%
  group_by(char, char_id) %>%
  summarize(burstiness_median = median(burstiness), .groups = "drop_last") %>%
  arrange(desc(burstiness_median))
# Top six burstiest characters. ungroup() first so distinct() does not
# silently re-add the grouping column (the original select() triggered
# an "Adding missing grouping variables" message).
bursty_chars <- bursty_df %>%
  ungroup() %>%
  distinct(char_id) %>%
  head(6) %>%
  pull(char_id)
# Partial-effect smooths for the burstiest characters.
par(mfrow = c(2, 3))
for (i in bursty_chars) {
  plot(model, select = i, main = chars[i])
}

# Re-seed so the slice_sample() draw below is reproducible on its own.
set.seed(1)

# Author metadata; keep only rows with a known mid_year, bucket authors
# by century, and sample up to 3 authors per century.
author_profile <- read_csv("/Users/mac/Documents/GitHub/GIL/CCKF/dietcoke/data/author_time/author_profile.csv") %>%
  filter(!is.na(mid_year)) %>%
  mutate(mid_year_grp=mid_year %/% 100) %>%
  group_by(mid_year_grp) %>%
  slice_sample(n=3)
## Warning: Missing column names filled in: 'X1' [1]
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   X1 = col_double(),
##   name = col_character(),
##   birth_year = col_double(),
##   mort_year = col_double(),
##   mid_year = col_double(),
##   age = col_double(),
##   urn = col_character(),
##   title = col_character(),
##   dynaspan = col_character(),
##   year_source = col_character()
## )
# Histogram and density of the sampled authors' mid-years, with dynasty
# boundaries overlaid (same annotation style as the slice plot above).
par(mfrow=c(2,1))
hist(author_profile$mid_year)

plot(density(author_profile$mid_year))
abline(v=dynaspan_yearto_levels, col="lightblue")
text(dynaspan_yearto_levels, 0.000025,  dynaspan_levels,
     cex=0.65, pos=2,col="lightblue") 
rug(author_profile$mid_year, ticksize=-0.1, col="blue")

# Recap: texts and text slices per dynastic span (same table as above).
df %>%
  distinct(dynaspan, k, urn) %>%
  group_by(dynaspan) %>%
  summarize(
    `Number of texts` = n_distinct(urn),
    `Number of text slices` = n()
  )
## # A tibble: 8 × 3
##   dynaspan   `Number of texts` `Number of text slices`
##   <fct>                  <int>                   <int>
## 1 先秦                       1                      12
## 2 漢                        10                     499
## 3 魏晉南北                  10                     218
## 4 唐五代十國                10                     248
## 5 宋元                      10                     312
## 6 明                        10                     293
## 7 清                        10                     178
## 8 民國                       2                     256